• Introduction
    • Sources d'inspiration
    • Liens entre les différentes tables
    • Chargement des librairies
    • Quelques fonctions utilitaires
  • Les données : exploration, cleaning et feature engineering
    • Application
      • Distribution des "TARGET"
      • Traitement des variables catégorielles
      • Feature engineering
      • Imputation des valeurs manquantes
    • Other tables
      • Bureau et Bureau Balance
        • Bureau Balance
          • Traitement des variables catégorielles
        • Bureau
          • Traitement des variables catégorielles
          • Feature engineering et merge
      • Précédentes demandes de crédit
        • Previous application
  • Filtrage des colonnes
    • Suppression des colonnes les plus vides (60%)
      • Imputation des valeurs manquantes
      • Sélection des colonnes les plus importantes
    • Binning des données

Introduction¶

Sources d'inspiration¶

https://www.kaggle.com/code/jsaguiar/lightgbm-with-simple-features [xxxx]

https://www.kaggle.com/code/mathchi/home-credit-risk-with-detailed-feature-engineering/

https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

Liens entre les différentes tables¶

Chargement des librairies¶

In [54]:
import os
import numpy as np
import pandas as pd

from fast_ml import eda

import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'

from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.impute import SimpleImputer

import gc
import time
import warnings
warnings.filterwarnings("ignore")

from contextlib import contextmanager

Quelques fonctions utilitaires¶

In [55]:
@contextmanager
def timer(title):
    t0 = time.perf_counter()
    yield
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - t0))

def read_data(csv_file_path):
    df = pd.read_csv(csv_file_path)
    print("Shape of the dataframe :" + str(df.shape))
    reduce_mem_usage(df)
    return df


def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type.
        to reduce memory usage.        

        1. Iterate over every column
        2. Determine if the column is numeric
        3. Determine if the column can be represented by an integer
        4. Find the min and the max value
        5. Determine and apply the smallest datatype that can fit the range of values

    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if (col_type != object) and (col_type != 'category'):
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))


def show_infos(df):
    cols = df.columns
    if 'TARGET' in cols : 
        df_0 =  df[df.TARGET == 0]
        df_1 = df[df.TARGET == 1]
        null_values = df.isna().sum().values
        null_values_0 = df_0.isna().sum().values
        null_values_1 = df_1.isna().sum().values
        types = df.dtypes.values
        info_df = pd.DataFrame(
                {"name": cols, 'total_null': null_values, "nulls for target=0": null_values_0, 
                "nulls for target=1": null_values_1})
        info_df['% nulls for target=0'] = round(100*info_df['nulls for target=0']/len(df_0), 2)
        info_df['% nulls for target=1'] = round(100*info_df['nulls for target=1']/len(df_1), 2)
        info_df['type'] = types
        info_df = info_df.sort_values(by='total_null', ascending=False)
    else :
        null_values = df.isna().sum().values
        types = df.dtypes.values
        info_df = pd.DataFrame(
                {"name": cols, "nulls": null_values})
        info_df['% nulls'] = round(100*info_df['nulls']/len(df), 2)
        info_df['type'] = types
        info_df = info_df.sort_values(by='nulls', ascending=False)

    return info_df


def show_category_details(df):
    for column in df.select_dtypes('category').columns:
        print(column)
        print(df[column].value_counts(normalize=True, dropna=False))

def category_columns(df):
    categ_col = []
    for column in df.select_dtypes('category').columns:
        categ_col = categ_col.append(column)
    categ_col


def list_emptiest_columns(df, threshold):
    infos = show_infos(df)
    return list(infos[infos['% nulls'] > threshold*100]['name'].values)


def list_unfrequent_category_values(df, frequency):
    unfreq_categ_dict = {}
    for column in df.select_dtypes('category').columns:
        categ_count = pd.DataFrame(
            df[column].value_counts(normalize=True, dropna=False))
        unfreq_categ = categ_count[categ_count[column] < frequency].index
        if len(list(unfreq_categ)) > 1:
            print(column)
            unfreq_categ_dict[column] = list(unfreq_categ)

    return unfreq_categ_dict


def list_overfrequent_category_values(df, frequency):
    freq_categ_dict = {}
    for column in df.select_dtypes('category').columns:
        categ_count = pd.DataFrame(
            df[column].value_counts(normalize=True, dropna=False))
        freq_categ = categ_count[categ_count[column] > frequency].index
        if len(freq_categ) > 0:
            print(column)
            freq_categ_dict[column] = list(freq_categ)
    return freq_categ_dict
    
def list_binary_cat_cols(df):
    binary_cols = [col for col in df if ((df[col].dtypes == 'category') and (
        col != 'TARGET') and (len(df[col].unique()) == 2))]
    return binary_cols


def list_binary_cols(df):
    binary_cols = [col for col in df if ((df[col].dtypes != 'category') and (
        col != 'TARGET') and (len(df[col].unique()) == 2))]
    return binary_cols

def get_numerical_non_binary_columns(df) :
    binary_cols = list_binary_cols(df)
    to_ignore = binary_cols + ['SK_ID_CURR']
    int_columns = [col for col in df if str(df[col].dtypes).startswith('int')]
    # Les colonnes float
    float_columns = [col for col in df if str(df[col].dtypes).startswith('float')]
    num_columns = float_columns + int_columns
    num_features =  [ele for ele in num_columns if ele not in to_ignore]
    return num_features


def one_hot_encoder(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [
        col for col in df.columns if df[col].dtype == 'category']
    df = pd.get_dummies(df, columns=categorical_columns,
                        dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

def get_percentiles(data, feature_name):

    target_0_data = data[data['TARGET'] == 0][feature_name].dropna()
    target_1_data = data[data['TARGET'] == 1][feature_name].dropna()
    df = pd.DataFrame(columns=["Percentile", "TARGET", 'Valeur'])
    for i in range(0, 101, 10):
        new_row1 = pd.Series({'Percentile': i, 'TARGET': '0', 'Valeur': np.round(
            np.percentile(target_0_data, i), 3)})
        new_row2 = pd.Series({'Percentile': i, 'TARGET': '1', 'Valeur': np.round(
            np.percentile(target_1_data, i), 3)})
        df = pd.concat([df, new_row1.to_frame().T], ignore_index=True)
        df = pd.concat([df, new_row2.to_frame().T], ignore_index=True)

    return df

    
def get_features_scores(df, features_to_check, test_func):
    fkbest_df = pd.DataFrame(columns = ['feature','score', 'p_value'])
    for col in features_to_check :
        subdf = df[[col,'TARGET']].dropna()
        X = subdf[[col]]
        y = subdf[['TARGET']]
        if test_func == f_classif :
            X = subdf[[col]].values.reshape(-1,1)
            y = subdf[['TARGET']].values.reshape(-1,1)
        score, p_value = test_func(X,y)
        fkbest_df = fkbest_df.append({'feature': col, 'score': score[0], 'p_value': p_value[0]}, ignore_index=True)
    return fkbest_df.sort_values(by='score', ascending=False)
In [56]:
data_dir = '../../data/'

Les données : exploration, cleaning et feature engineering¶

Application¶

In [57]:
application = read_data(data_dir+'application_train.csv')
Shape of the dataframe :(307511, 122)
Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%

Distribution des "TARGET"¶

Le jeu de données est pour le moins déséquilibré.

In [58]:
distrib_target = pd.DataFrame(application.TARGET.value_counts()).reset_index(
).rename(columns={'index': 'target', 'TARGET': 'count'})
In [59]:
fig = px.pie(distrib_target, values='count', names='target')
fig.update_layout(title = 'Distribution des négatifs et des positifs', height=400, width=400)
fig.show()
In [60]:
summary_df = eda.df_info(application)
summary_df.sort_values(by='num_unique_values', ascending=True)
Out[60]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
LIVE_CITY_NOT_WORK_CITY int8 Numerical 2 [0, 1] 0 0.0
FLAG_DOCUMENT_18 int8 Numerical 2 [0, 1] 0 0.0
FLAG_DOCUMENT_14 int8 Numerical 2 [0, 1] 0 0.0
FLAG_MOBIL int8 Numerical 2 [1, 0] 0 0.0
FLAG_EMP_PHONE int8 Numerical 2 [1, 0] 0 0.0
... ... ... ... ... ... ...
EXT_SOURCE_2 float16 Numerical 6888 [0.262939453125, 0.6220703125, 0.55615234375, ... 660 0.214626
DAYS_EMPLOYED int32 Numerical 12574 [-637, -1188, -225, -3039, -3038, -1588, -3130... 0 0.0
AMT_ANNUITY float32 Numerical 13672 [24700.5, 35698.5, 6750.0, 29686.5, 21865.5, 2... 12 0.003902
DAYS_BIRTH int16 Numerical 17460 [-9461, -16765, -19046, -19005, -19932, -16941... 0 0.0
SK_ID_CURR int32 Numerical 307511 [100002, 100003, 100004, 100006, 100007, 10000... 0 0.0

122 rows × 6 columns

Extraction des informations sur le lieu de vie du client

In [61]:
mod_medi_avg_col_list = [col for col in application.columns if (
    col.endswith('_MODE') or col.endswith('_MEDI') or col.endswith('AVG'))]
total_list = mod_medi_avg_col_list 
env_application = application[total_list + ['TARGET']]

del mod_medi_avg_col_list, 
gc.collect()

summary_df = eda.df_info(env_application)
summary_df.sort_values(by='perc_missing', ascending=False)
Out[61]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
COMMONAREA_MEDI float16 Numerical 2849 [0.01439666748046875, 0.060791015625, nan, 0.0... 214865 69.872297
COMMONAREA_AVG float16 Numerical 2841 [0.0142974853515625, 0.06048583984375, nan, 0.... 214865 69.872297
COMMONAREA_MODE float16 Numerical 2808 [0.01439666748046875, 0.049713134765625, nan, ... 214865 69.872297
NONLIVINGAPARTMENTS_MODE float16 Numerical 167 [0.0, nan, 0.019500732421875, 0.00780105590820... 213514 69.432963
NONLIVINGAPARTMENTS_AVG float16 Numerical 386 [0.0, 0.0039005279541015625, nan, 0.0193023681... 213514 69.432963
NONLIVINGAPARTMENTS_MEDI float16 Numerical 214 [0.0, 0.0039005279541015625, nan, 0.0193939208... 213514 69.432963
FONDKAPREMONT_MODE category category 4 [reg oper account, nan, org spec account, reg ... 210295 68.386172
LIVINGAPARTMENTS_MODE float16 Numerical 736 [0.022003173828125, 0.0789794921875, nan, 0.13... 210199 68.354953
LIVINGAPARTMENTS_AVG float16 Numerical 1840 [0.02020263671875, 0.0772705078125, nan, 0.120... 210199 68.354953
LIVINGAPARTMENTS_MEDI float16 Numerical 1097 [0.0204925537109375, 0.07867431640625, nan, 0.... 210199 68.354953
FLOORSMIN_MODE float16 Numerical 25 [0.125, 0.333251953125, nan, 0.375, 0.70849609... 208642 67.84863
FLOORSMIN_MEDI float16 Numerical 47 [0.125, 0.333251953125, nan, 0.375, 0.70849609... 208642 67.84863
FLOORSMIN_AVG float16 Numerical 304 [0.125, 0.333251953125, nan, 0.375, 0.70849609... 208642 67.84863
YEARS_BUILD_MEDI float16 Numerical 151 [0.62451171875, 0.798828125, nan, 0.73828125, ... 204488 66.497784
YEARS_BUILD_MODE float16 Numerical 154 [0.63427734375, 0.80419921875, nan, 0.74511718... 204488 66.497784
YEARS_BUILD_AVG float16 Numerical 149 [0.619140625, 0.7958984375, nan, 0.73486328125... 204488 66.497784
LANDAREA_MODE float16 Numerical 3031 [0.037689208984375, 0.0128021240234375, nan, 0... 182590 59.376738
LANDAREA_AVG float16 Numerical 3011 [0.036895751953125, 0.01300048828125, nan, 0.0... 182590 59.376738
LANDAREA_MEDI float16 Numerical 3041 [0.037506103515625, 0.0131988525390625, nan, 0... 182590 59.376738
BASEMENTAREA_MODE float16 Numerical 3148 [0.038299560546875, 0.053802490234375, nan, 0.... 179943 58.515956
BASEMENTAREA_MEDI float16 Numerical 3121 [0.036895751953125, 0.052886962890625, nan, 0.... 179943 58.515956
BASEMENTAREA_AVG float16 Numerical 3127 [0.036895751953125, 0.052886962890625, nan, 0.... 179943 58.515956
NONLIVINGAREA_AVG float16 Numerical 2885 [0.0, 0.00980377197265625, nan, 0.10009765625,... 169682 55.179164
NONLIVINGAREA_MEDI float16 Numerical 2923 [0.0, 0.01000213623046875, nan, 0.102172851562... 169682 55.179164
NONLIVINGAREA_MODE float16 Numerical 2924 [0.0, nan, 0.10601806640625, 0.00469970703125,... 169682 55.179164
ELEVATORS_AVG float16 Numerical 257 [0.0, 0.08001708984375, nan, 0.1600341796875, ... 163891 53.29598
ELEVATORS_MEDI float16 Numerical 46 [0.0, 0.08001708984375, nan, 0.1600341796875, ... 163891 53.29598
ELEVATORS_MODE float16 Numerical 26 [0.0, 0.08062744140625, nan, 0.1611328125, 0.4... 163891 53.29598
WALLSMATERIAL_MODE category category 7 [Stone, brick, Block, nan, Panel, Mixed, Woode... 156341 50.840783
APARTMENTS_AVG float16 Numerical 2251 [0.0247039794921875, 0.09588623046875, nan, 0.... 156061 50.749729
APARTMENTS_MODE float16 Numerical 760 [0.02520751953125, 0.0924072265625, nan, 0.083... 156061 50.749729
APARTMENTS_MEDI float16 Numerical 1148 [0.024993896484375, 0.0968017578125, nan, 0.08... 156061 50.749729
ENTRANCES_MEDI float16 Numerical 46 [0.0689697265625, 0.03448486328125, nan, 0.206... 154828 50.348768
ENTRANCES_MODE float16 Numerical 30 [0.0689697265625, 0.03448486328125, nan, 0.206... 154828 50.348768
ENTRANCES_AVG float16 Numerical 285 [0.0689697265625, 0.03448486328125, nan, 0.206... 154828 50.348768
LIVINGAREA_MEDI float16 Numerical 3741 [0.0193023681640625, 0.0557861328125, nan, 0.0... 154350 50.193326
LIVINGAREA_MODE float16 Numerical 3746 [0.019805908203125, 0.055389404296875, nan, 0.... 154350 50.193326
LIVINGAREA_AVG float16 Numerical 3715 [0.0189971923828125, 0.054901123046875, nan, 0... 154350 50.193326
HOUSETYPE_MODE category category 3 [block of flats, nan, terraced house, specific... 154297 50.176091
FLOORSMAX_MODE float16 Numerical 25 [0.08331298828125, 0.291748046875, nan, 0.1667... 153020 49.760822
FLOORSMAX_MEDI float16 Numerical 49 [0.08331298828125, 0.291748046875, nan, 0.1667... 153020 49.760822
FLOORSMAX_AVG float16 Numerical 403 [0.08331298828125, 0.291748046875, nan, 0.1667... 153020 49.760822
YEARS_BEGINEXPLUATATION_AVG float16 Numerical 284 [0.97216796875, 0.98486328125, nan, 0.98095703... 150007 48.781019
YEARS_BEGINEXPLUATATION_MEDI float16 Numerical 244 [0.97216796875, 0.98486328125, nan, 0.98095703... 150007 48.781019
YEARS_BEGINEXPLUATATION_MODE float16 Numerical 221 [0.97216796875, 0.98486328125, nan, 0.98095703... 150007 48.781019
TOTALAREA_MODE float16 Numerical 3660 [0.01490020751953125, 0.0714111328125, nan, 0.... 148431 48.268517
EMERGENCYSTATE_MODE category category 2 [No, nan, Yes] 145755 47.398304
TARGET int8 Numerical 2 [1, 0] 0 0.0

Les informations sur l'environnement où vit le client sont d'une façon générale peu renseignées, et sont soit redondantes avec des informations plus pertinentes sur les revenus du client, soit non porteuses d'information.

In [62]:
application = application.drop(columns=total_list)
del total_list
gc.collect()
Out[62]:
0

Traitement des variables catégorielles¶

Dans l'optique de faire en sorte que les dimensions ne soient pas trop grandes lors de l'encoding, je traite ici les variables qualitatives. Je considère comme non-discriminantes des variables qualitatives ayant une valeur sur-représentée pour les deux targets. Par ailleurs, je vais regrouper sous la même appellation les valeurs les moins fréquentes.

In [63]:
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(application))

print ("Valeurs les plus fréquentes pour les variables qualitatives pour Target = 0")
print(list_overfrequent_category_values(application[application['TARGET']==0],0.9))
print ("Valeurs les plus fréquentes pour les variables qualitatives pour Target = 1")
print(list_overfrequent_category_values(application[application['TARGET']==1],0.9))

print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(application,0.01))
Liste des colonnes à valeurs catégorielles binaires
['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']
Valeurs les plus fréquentes pour les variables qualitatives pour Target = 0
NAME_CONTRACT_TYPE
{'NAME_CONTRACT_TYPE': ['Cash loans']}
Valeurs les plus fréquentes pour les variables qualitatives pour Target = 1
NAME_CONTRACT_TYPE
{'NAME_CONTRACT_TYPE': ['Cash loans']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_TYPE_SUITE
NAME_INCOME_TYPE
NAME_HOUSING_TYPE
OCCUPATION_TYPE
ORGANIZATION_TYPE
{'NAME_TYPE_SUITE': ['Other_B', nan, 'Other_A', 'Group of people'], 'NAME_INCOME_TYPE': ['Unemployed', 'Student', 'Businessman', 'Maternity leave'], 'NAME_HOUSING_TYPE': ['Office apartment', 'Co-op apartment'], 'OCCUPATION_TYPE': ['Private service staff', 'Low-skill Laborers', 'Waiters/barmen staff', 'Secretaries', 'Realty agents', 'HR staff', 'IT staff'], 'ORGANIZATION_TYPE': ['Housing', 'Industry: type 11', 'Military', 'Bank', 'Agriculture', 'Police', 'Transport: type 2', 'Postal', 'Security Ministries', 'Trade: type 2', 'Restaurant', 'Services', 'University', 'Industry: type 7', 'Transport: type 3', 'Industry: type 1', 'Hotel', 'Electricity', 'Industry: type 4', 'Trade: type 6', 'Industry: type 5', 'Insurance', 'Telecom', 'Emergency', 'Industry: type 2', 'Advertising', 'Realtor', 'Culture', 'Industry: type 12', 'Trade: type 1', 'Mobile', 'Legal Services', 'Cleaning', 'Transport: type 1', 'Industry: type 6', 'Industry: type 10', 'Religion', 'Industry: type 13', 'Trade: type 4', 'Trade: type 5', 'Industry: type 8']}

Définition d'une fonction destinée à "condenser" les variables qualitatives en diminuant le nombre de modes.

In [64]:
def condense_category_app(df):
    print("Application samples: {}".format(len(df)))
   
    df['NAME_TYPE_SUITE'] = df['NAME_TYPE_SUITE'].replace(['Other_B', 'Other_A', 'Group of people'], 'Unfrequent')
   
    df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].replace(['Unemployed', 'Student', 'Businessman', 'Maternity leave'], 'Unfrequent')

    df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].replace(
        ['Private service staff', 'Low-skill Laborers', 'Waiters/barmen staff', 'Secretaries',
         'Realty agents', 'HR staff', 'IT staff'], 'Unfrequent')

    df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].replace(
        ['Housing', 'Industry: type 11', 'Military', 'Bank', 'Agriculture', 'Police', 'Transport: type 2',
         'Postal', 'Security Ministries', 'Trade: type 2', 'Restaurant', 'Services', 'University', 'Industry: type 7',
         'Transport: type 3', 'Industry: type 1', 'Hotel', 'Electricity', 'Industry: type 4', 'Trade: type 6',
         'Industry: type 5', 'Insurance', 'Telecom', 'Emergency', 'Industry: type 2', 'Advertising',
         'Realtor', 'Culture', 'Industry: type 12', 'Trade: type 1', 'Mobile', 'Legal Services',
         'Cleaning', 'Transport: type 1', 'Industry: type 6', 'Industry: type 10', 'Religion',
         'Industry: type 13', 'Trade: type 4', 'Trade: type 5', 'Industry: type 8'], 'Unfrequent')

    return df
In [65]:
show_infos(application[application['FLAG_OWN_CAR'] == 'Y'][['OWN_CAR_AGE','TARGET']])
Out[65]:
name total_null nulls for target=0 nulls for target=1 % nulls for target=0 % nulls for target=1 type
0 OWN_CAR_AGE 5 5 0 0.01 0.0 float16
1 TARGET 0 0 0 0.00 0.0 int8

Feature engineering¶

In [66]:
def fe_application(df, nan_as_category=True):

    # Suppression de colonnes que je juge non-pertinentes, en particulier CODE_GENDER
    df = df.drop(columns=['NAME_CONTRACT_TYPE','CODE_GENDER','DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE','DAYS_ID_PUBLISH',
    'WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','EXT_SOURCE_1','AMT_REQ_CREDIT_BUREAU_HOUR',
    'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','OBS_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE'])

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
         df[bin_feature], uniques = pd.factorize(df[bin_feature])

    df = condense_category_app(df)

     # Ajout de l'âge

    df['AGE'] = round(-df.DAYS_BIRTH/365)

    # Mise à NaN de la valeur par défaut de DAYS_EMPLOYED

    df['DAYS_EMPLOYED'].replace(3.652430e+05, np.nan, inplace=True)

   
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']

    df = df.drop(columns=['DAYS_BIRTH'])

    # Categorical features with One-Hot encoder
    df, _ = one_hot_encoder(df, nan_as_category)

    gc.collect()
    return df
In [67]:
application = fe_application(application)
Application samples: 307511
In [68]:
show_infos(application)
Out[68]:
name total_null nulls for target=0 nulls for target=1 % nulls for target=0 % nulls for target=1 type
11 OWN_CAR_AGE 202929 185680 17249 65.68 69.48 float16
28 EXT_SOURCE_3 60965 55288 5677 19.56 22.87 float16
55 DAYS_EMPLOYED_PERC 55374 52384 2990 18.53 12.04 float64
10 DAYS_EMPLOYED 55374 52384 2990 18.53 12.04 float64
51 AMT_REQ_CREDIT_BUREAU_MON 41519 37227 4292 13.17 17.29 float16
... ... ... ... ... ... ... ...
41 FLAG_DOCUMENT_12 0 0 0 0.00 0.00 int8
40 FLAG_DOCUMENT_11 0 0 0 0.00 0.00 int8
39 FLAG_DOCUMENT_10 0 0 0 0.00 0.00 int8
38 FLAG_DOCUMENT_9 0 0 0 0.00 0.00 int8
123 ORGANIZATION_TYPE_nan 0 0 0 0.00 0.00 uint8

124 rows × 7 columns

In [69]:
summary_df = eda.df_info(application)
summary_df[summary_df['num_missing'] > 0].sort_values(by='perc_missing', ascending=False)
Out[69]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
OWN_CAR_AGE float16 Numerical 62 [nan, 26.0, 17.0, 8.0, 23.0, 7.0, 14.0, 1.0, 3... 202929 65.99081
EXT_SOURCE_3 float16 Numerical 814 [0.139404296875, nan, 0.7294921875, 0.62109375... 60965 19.825307
DAYS_EMPLOYED float64 Numerical 12573 [-637.0, -1188.0, -225.0, -3039.0, -3038.0, -1... 55374 18.007161
DAYS_EMPLOYED_PERC float64 Numerical 249434 [0.0673290349857309, 0.07086191470325082, 0.01... 55374 18.007161
AMT_REQ_CREDIT_BUREAU_MON float16 Numerical 24 [0.0, nan, 1.0, 2.0, 6.0, 5.0, 3.0, 7.0, 9.0, ... 41519 13.501631
AMT_REQ_CREDIT_BUREAU_QRT float16 Numerical 11 [0.0, nan, 1.0, 2.0, 4.0, 3.0, 8.0, 5.0, 6.0, ... 41519 13.501631
AMT_REQ_CREDIT_BUREAU_YEAR float16 Numerical 25 [1.0, 0.0, nan, 2.0, 4.0, 5.0, 3.0, 8.0, 6.0, ... 41519 13.501631
DEF_30_CNT_SOCIAL_CIRCLE float16 Numerical 10 [2.0, 0.0, 1.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ... 1021 0.332021
DEF_60_CNT_SOCIAL_CIRCLE float16 Numerical 9 [2.0, 0.0, 1.0, nan, 3.0, 5.0, 4.0, 7.0, 24.0,... 1021 0.332021
EXT_SOURCE_2 float16 Numerical 6888 [0.262939453125, 0.6220703125, 0.55615234375, ... 660 0.214626
AMT_GOODS_PRICE float32 Numerical 1002 [351000.0, 1129500.0, 135000.0, 297000.0, 5130... 278 0.090403
AMT_ANNUITY float32 Numerical 13672 [24700.5, 35698.5, 6750.0, 29686.5, 21865.5, 2... 12 0.003902
ANNUITY_INCOME_PERC float32 Numerical 88804 [0.1219777762889862, 0.1322166621685028, 0.100... 12 0.003902
PAYMENT_RATE float32 Numerical 39101 [0.06074926629662514, 0.027598323300480843, 0.... 12 0.003902
CNT_FAM_MEMBERS float16 Numerical 17 [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 9.0, 7.0, 8.0, ... 2 0.00065
INCOME_PER_PERSON float32 Numerical 3064 [202500.0, 135000.0, 67500.0, 121500.0, 49500.... 2 0.00065

Imputation des valeurs manquantes¶

J'ai choisi de faire une imputation des valeurs vides par la médiane

In [70]:
columns = application.columns

imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
values = imp_median.fit_transform(application)
application = pd.DataFrame(values, columns=columns)
In [71]:
reduce_mem_usage(application)
Memory usage of dataframe is 290.92 MB
Memory usage after optimization is: 76.25 MB
Decreased by 73.8%

Other tables¶

In [72]:
def remove_absent_id_curr(df):
    initial_len = len(df)
    df = df[df.SK_ID_CURR.isin(application.SK_ID_CURR)]
    print('%s rows have been removed' %(initial_len-len(df)))

Bureau et Bureau Balance¶

Bureau Balance¶

In [73]:
bureau_balance = read_data(data_dir+'bureau_balance.csv')
Shape of the dataframe :(27299925, 3)
Memory usage of dataframe is 624.85 MB
Memory usage after optimization is: 156.21 MB
Decreased by 75.0%
In [74]:
eda.df_info(bureau_balance)
Out[74]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
SK_ID_BUREAU int32 Numerical 817395 [5715448, 5715449, 5715451, 5715452, 5715453, ... 0 0.0
MONTHS_BALANCE int8 Numerical 97 [0, -1, -2, -3, -4, -5, -6, -7, -8, -9] 0 0.0
STATUS category category 8 [C, 0, X, 1, 2, 3, 5, 4] 0 0.0
Traitement des variables catégorielles¶
In [75]:
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(bureau_balance))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(bureau_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(bureau_balance,0.01))
Liste des colonnes à valeurs catégorielles binaires
[]
Valeurs les plus fréquentes pour les variables qualitatives
{}
Valeurs les moins fréquentes pour les variables qualitatives
STATUS
{'STATUS': ['1', '5', '2', '3', '4']}

Bureau¶

In [76]:
bureau = read_data(data_dir+'bureau.csv')

initial_len = len(bureau)
bureau = bureau[bureau.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(bureau)))
Shape of the dataframe :(1716428, 17)
Memory usage of dataframe is 222.62 MB
Memory usage after optimization is: 78.57 MB
Decreased by 64.7%
251103 rows have been removed
In [77]:
eda.df_info(bureau).sort_values(by='num_unique_values', ascending=True)
Out[77]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
CREDIT_ACTIVE category category 4 [Closed, Active, Sold, Bad debt] 0 0.0
CREDIT_CURRENCY category category 4 [currency 1, currency 2, currency 4, currency 3] 0 0.0
CNT_CREDIT_PROLONG int8 Numerical 10 [0, 2, 1, 4, 3, 5, 9, 8, 6, 7] 0 0.0
CREDIT_TYPE category category 15 [Consumer credit, Credit card, Mortgage, Car l... 0 0.0
CREDIT_DAY_OVERDUE int16 Numerical 893 [0, 2603, 6, 30, 2156, 496, 186, 2264, 41, 8] 0 0.0
AMT_CREDIT_SUM_OVERDUE float32 Numerical 1440 [0.0, 231.52499389648438, 288.0, 58.5, 504.0, ... 0 0.0
DAYS_ENDDATE_FACT float16 Numerical 2486 [-153.0, nan, -1710.0, -840.0, -825.0, -187.0,... 544673 37.170798
DAYS_CREDIT int16 Numerical 2923 [-497, -208, -203, -629, -273, -43, -1896, -11... 0 0.0
DAYS_CREDIT_UPDATE int32 Numerical 2980 [-131, -20, -16, -21, -31, -22, -1710, -840, -... 0 0.0
DAYS_CREDIT_ENDDATE float16 Numerical 6999 [-153.0, 1075.0, 528.0, nan, 1197.0, 27456.0, ... 89098 6.080426
AMT_ANNUITY float32 Numerical 30103 [nan, 0.0, 2691.0, 24462.0, 8181.0, 8061.20996... 1130013 77.116885
AMT_CREDIT_SUM_LIMIT float32 Numerical 45290 [nan, 108982.6171875, 0.0, 228320.09375, 411.6... 489670 33.41716
AMT_CREDIT_MAX_OVERDUE float32 Numerical 61066 [nan, 77674.5, 0.0, 14985.0, 310.5, 20493.2695... 948545 64.732738
AMT_CREDIT_SUM_DEBT float32 Numerical 202909 [0.0, 171342.0, nan, 71017.3828125, 42103.8007... 223094 15.224882
AMT_CREDIT_SUM float32 Numerical 211513 [91323.0, 225000.0, 464323.5, 90000.0, 2700000... 3 0.000205
SK_ID_CURR int32 Numerical 263491 [215354, 162297, 402440, 238881, 222183, 42615... 0 0.0
SK_ID_BUREAU int32 Numerical 1465325 [5714462, 5714463, 5714464, 5714465, 5714466, ... 0 0.0
Traitement des variables catégorielles¶
In [78]:
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(bureau))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(bureau,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(bureau,0.01))
Liste des colonnes à valeurs catégorielles binaires
[]
Valeurs les plus fréquentes pour les variables qualitatives
CREDIT_CURRENCY
{'CREDIT_CURRENCY': ['currency 1']}
Valeurs les moins fréquentes pour les variables qualitatives
CREDIT_ACTIVE
CREDIT_CURRENCY
CREDIT_TYPE
{'CREDIT_ACTIVE': ['Sold', 'Bad debt'], 'CREDIT_CURRENCY': ['currency 2', 'currency 3', 'currency 4'], 'CREDIT_TYPE': ['Microloan', 'Loan for business development', 'Another type of loan', 'Unknown type of loan', 'Loan for working capital replenishment', 'Cash loan (non-earmarked)', 'Real estate loan', 'Loan for the purchase of equipment', 'Loan for purchase of shares (margin lending)', 'Interbank credit', 'Mobile operator loan']}
Feature engineering et merge¶
In [79]:
def fe_bureau_and_balance(bureau, bb, nan_as_category=True):
    bb['MONTHS_BALANCE'] = -bb['MONTHS_BALANCE']
    bb['STATUS'] = bb['STATUS'].replace(['1','5', '2', '3', '4'], '1_5')
    bureau['CREDIT_TYPE'] = bureau['CREDIT_TYPE'].replace(['Loan for business development',
                                            'Another type of loan',
                                            'Unknown type of loan',
                                            'Loan for working capital replenishment',
                                            'Cash loan (non-earmarked)',
                                            'Real estate loan',
                                            'Loan for the purchase of equipment',
                                            'Loan for purchase of shares (margin lending)',
                                            'Interbank credit',
                                            'Mobile operator loan'], 'Unfrequent')
    
    bureau.drop(columns=['CREDIT_CURRENCY'], inplace=True)
    
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']

    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper()
                              for e in bb_agg.columns.tolist()])

    # Encodage des catégories
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: merge with bureau.csv 
    #                    
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()


    # Bureau and bureau_balance numeric features
    
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features

    cat_aggregations = {}

    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']

    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']
    

    bureau_agg = bureau.groupby('SK_ID_CURR').agg(
        {**num_aggregations, **cat_aggregations})

    bureau_agg.columns = pd.Index(
        ['BUREAU_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

    # Bureau: Active credits - using only numerical aggregations

    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(
        ['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()

    # Bureau: Closed credits - using only numerical aggregations

    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(
        ['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg
In [80]:
bureau_and_bb = fe_bureau_and_balance(bureau, bureau_balance)
del bureau, bureau_balance
gc.collect()
reduce_mem_usage(bureau_and_bb)
Memory usage of dataframe is 144.49 MB
Memory usage after optimization is: 70.61 MB
Decreased by 51.1%

Précédentes demandes de crédit¶

Previous application¶

In [81]:
previous_application = read_data(data_dir+'previous_application.csv')
initial_len = len(previous_application)
previous_application = previous_application[previous_application.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows with SK_ID_CURR not appearing in Application have been removed' %(initial_len-len(previous_application)))
Shape of the dataframe :(1670214, 37)
Memory usage of dataframe is 471.48 MB
Memory usage after optimization is: 130.62 MB
Decreased by 72.3%
256513 rows with SK_ID_CURR not appearing in Application have been removed
Traitement des variables catégorielles¶
In [82]:
print ("Liste des colonnes à valeurs catégorielles binaires")
print(list_binary_cat_cols(previous_application))
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(previous_application,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(previous_application, 0.01))
Liste des colonnes à valeurs catégorielles binaires
['FLAG_LAST_APPL_PER_CONTRACT']
Valeurs les plus fréquentes pour les variables qualitatives
FLAG_LAST_APPL_PER_CONTRACT
{'FLAG_LAST_APPL_PER_CONTRACT': ['Y']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CASH_LOAN_PURPOSE
NAME_PAYMENT_TYPE
CODE_REJECT_REASON
NAME_TYPE_SUITE
NAME_GOODS_CATEGORY
CHANNEL_TYPE
NAME_SELLER_INDUSTRY
PRODUCT_COMBINATION
{'NAME_CASH_LOAN_PURPOSE': ['Other', 'Urgent needs', 'Buying a used car', 'Building a house or an annex', 'Everyday expenses', 'Medicine', 'Payments on other loans', 'Education', 'Journey', 'Purchase of electronic equipment', 'Buying a new car', 'Wedding / gift / holiday', 'Buying a home', 'Car repairs', 'Furniture', 'Buying a holiday home / land', 'Business development', 'Gasification / water supply', 'Buying a garage', 'Hobby', 'Money for a third person', 'Refusal to name the goal'], 'NAME_PAYMENT_TYPE': ['Non-cash from your account', 'Cashless from the account of the employer'], 'CODE_REJECT_REASON': ['SCOFR', 'XNA', 'VERIF', 'SYSTEM'], 'NAME_TYPE_SUITE': ['Other_A', 'Group of people'], 'NAME_GOODS_CATEGORY': ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', 'Other', 'Gardening', 'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', 'Education', 'Weapon', 'Insurance', 'Animals', 'House Construction'], 'CHANNEL_TYPE': ['Channel of corporate sales', 'Car dealer'], 'NAME_SELLER_INDUSTRY': ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism'], 'PRODUCT_COMBINATION': ['POS industry without interest', 'POS others without interest', nan]}
In [83]:
show_category_details(previous_application)
NAME_CONTRACT_TYPE
Cash loans         0.443350
Consumer loans     0.442283
Revolving loans    0.114146
XNA                0.000221
Name: NAME_CONTRACT_TYPE, dtype: float64
WEEKDAY_APPR_PROCESS_START
WEDNESDAY    0.152450
TUESDAY      0.152124
MONDAY       0.151736
FRIDAY       0.150932
THURSDAY     0.149257
SATURDAY     0.144415
SUNDAY       0.099086
Name: WEEKDAY_APPR_PROCESS_START, dtype: float64
FLAG_LAST_APPL_PER_CONTRACT
Y    0.994826
N    0.005174
Name: FLAG_LAST_APPL_PER_CONTRACT, dtype: float64
NAME_CASH_LOAN_PURPOSE
XAP                                 0.556650
XNA                                 0.401323
Repairs                             0.014230
Other                               0.009501
Urgent needs                        0.005118
Buying a used car                   0.001746
Building a house or an annex        0.001658
Everyday expenses                   0.001452
Medicine                            0.001323
Payments on other loans             0.001113
Education                           0.000944
Journey                             0.000737
Purchase of electronic equipment    0.000661
Buying a new car                    0.000627
Wedding / gift / holiday            0.000567
Buying a home                       0.000496
Car repairs                         0.000489
Furniture                           0.000467
Buying a holiday home / land        0.000328
Business development                0.000254
Gasification / water supply         0.000178
Buying a garage                     0.000082
Hobby                               0.000032
Money for a third person            0.000016
Refusal to name the goal            0.000009
Name: NAME_CASH_LOAN_PURPOSE, dtype: float64
NAME_CONTRACT_STATUS
Approved        0.626794
Canceled        0.183519
Refused         0.173580
Unused offer    0.016107
Name: NAME_CONTRACT_STATUS, dtype: float64
NAME_PAYMENT_TYPE
Cash through the bank                        0.624389
XNA                                          0.369923
Non-cash from your account                   0.005022
Cashless from the account of the employer    0.000666
Name: NAME_PAYMENT_TYPE, dtype: float64
CODE_REJECT_REASON
XAP       0.810308
HC        0.103264
LIMIT     0.033793
SCO       0.023086
CLIENT    0.016107
SCOFR     0.007693
XNA       0.003097
VERIF     0.002178
SYSTEM    0.000475
Name: CODE_REJECT_REASON, dtype: float64
NAME_TYPE_SUITE
NaN                0.491385
Unaccompanied      0.302493
Family             0.128655
Spouse, partner    0.040734
Children           0.019174
Other_B            0.010662
Other_A            0.005532
Group of people    0.001365
Name: NAME_TYPE_SUITE, dtype: float64
NAME_CLIENT_TYPE
Repeater     0.734018
New          0.183589
Refreshed    0.081301
XNA          0.001091
Name: NAME_CLIENT_TYPE, dtype: float64
NAME_GOODS_CATEGORY
XNA                         5.639163e-01
Mobile                      1.370467e-01
Consumer Electronics        7.412105e-02
Computers                   6.360822e-02
Audio/Video                 6.048096e-02
Furniture                   3.214187e-02
Photo / Cinema Equipment    1.535473e-02
Construction Materials      1.510645e-02
Clothing and Accessories    1.385724e-02
Auto Accessories            4.405458e-03
Jewelry                     3.669800e-03
Homewares                   3.048028e-03
Medical Supplies            2.291857e-03
Vehicles                    2.018814e-03
Sport and Leisure           1.796702e-03
Other                       1.653108e-03
Gardening                   1.642497e-03
Office Appliances           1.404116e-03
Tourism                     9.365488e-04
Medicine                    9.209868e-04
Direct Sales                2.334298e-04
Fitness                     1.174223e-04
Additional Service          7.710258e-05
Education                   6.295532e-05
Weapon                      4.597861e-05
Insurance                   4.102706e-05
Animals                     7.073632e-07
House Construction          0.000000e+00
Name: NAME_GOODS_CATEGORY, dtype: float64
NAME_PORTFOLIO
POS      0.419084
Cash     0.277048
XNA      0.217311
Cards    0.086288
Cars     0.000269
Name: NAME_PORTFOLIO, dtype: float64
NAME_PRODUCT_TYPE
XNA        0.636664
x-sell     0.272614
walk-in    0.090722
Name: NAME_PRODUCT_TYPE, dtype: float64
CHANNEL_TYPE
Credit and cash offices       0.424660
Country-wide                  0.299258
Stone                         0.129807
Regional / Local              0.065411
Contact center                0.041657
AP+ (Cash loan)               0.035102
Channel of corporate sales    0.003817
Car dealer                    0.000287
Name: CHANNEL_TYPE, dtype: float64
NAME_SELLER_INDUSTRY
XNA                     0.505992
Consumer electronics    0.241707
Connectivity            0.168600
Furniture               0.034637
Construction            0.017979
Clothing                0.014074
Industry                0.011719
Auto technology         0.002886
Jewelry                 0.001490
MLM partners            0.000651
Tourism                 0.000265
Name: NAME_SELLER_INDUSTRY, dtype: float64
NAME_YIELD_GROUP
XNA           0.303599
middle        0.229873
high          0.216496
low_normal    0.194443
low_action    0.055590
Name: NAME_YIELD_GROUP, dtype: float64
PRODUCT_COMBINATION
Cash                              0.166302
POS household with interest       0.160192
POS mobile with interest          0.135000
Cash X-Sell: middle               0.084909
Cash X-Sell: low                  0.078234
Card Street                       0.066912
POS industry with interest        0.059070
POS household without interest    0.050590
Card X-Sell                       0.047233
Cash Street: high                 0.036570
Cash X-Sell: high                 0.035935
Cash Street: middle               0.021174
Cash Street: low                  0.020226
POS other with interest           0.014366
POS mobile without interest       0.014069
POS industry without interest     0.007513
POS others without interest       0.001482
NaN                               0.000221
Name: PRODUCT_COMBINATION, dtype: float64

Fonction qui permet de "condenser" les variables qualitatives en réduisant le nombre de modes

In [84]:
def condense_category_values(previous_application):
    a = ['Family','Spouse, partner','Children']
    previous_application["NAME_TYPE_SUITE"] = previous_application["NAME_TYPE_SUITE"].replace(a, 'Family')
    a_bis = ['Other_B', 'Other_A', 'Group of people']
    previous_application["NAME_TYPE_SUITE"] = previous_application["NAME_TYPE_SUITE"].replace(a_bis, 'Other_company')

    # NAME_GOODS_CATEGORY
    a = ['Auto Accessories', 'Jewelry', 'Homewares', 'Medical Supplies', 'Vehicles', 'Sport and Leisure', 'Other', 'Gardening',
         'Office Appliances', 'Tourism', 'Medicine', 'Direct Sales', 'Fitness', 'Additional Service', 'Education', 'Weapon',
         'Insurance', 'Animals', 'House Construction']

    previous_application["NAME_GOODS_CATEGORY"] = previous_application["NAME_GOODS_CATEGORY"].replace(
        a, 'Other')

    # NAME_CASH_LOAN_PURPOSE
    a = ['Buying a used car', 'Building a house or an annex', 'Everyday expenses', 'Medicine', 'Payments on other loans',
         'Education', 'Journey', 'Purchase of electronic equipment', 'Buying a new car', 'Wedding / gift / holiday', 'Buying a home',
         'Car repairs', 'Furniture', 'Buying a holiday home / land', 'Business development', 'Gasification / water supply',
         'Buying a garage', 'Hobby', 'Money for a third person', 'Refusal to name the goal']

    previous_application["NAME_CASH_LOAN_PURPOSE"] = previous_application["NAME_CASH_LOAN_PURPOSE"].replace(
        a, 'Other')

    # CODE_REJECT_REASON
    a = ['XNA', 'VERIF', 'SYSTEM']

    previous_application["CODE_REJECT_REASON"] = previous_application["CODE_REJECT_REASON"].replace(
        a, 'XNA_VERIF_SYSTEM')

    # CHANNEL_TYPE
    a = ['Channel of corporate sales', 'Car dealer']
    previous_application["CHANNEL_TYPE"] = previous_application["CHANNEL_TYPE"].replace(
        a, 'Other_Channel')

    # NAME_SELLER_INDUSTRY
    a = ['Auto technology', 'Jewelry', 'MLM partners', 'Tourism']
    previous_application["NAME_SELLER_INDUSTRY"] = previous_application["NAME_SELLER_INDUSTRY"].replace(
        a, 'Other_Industry')
    
    return previous_application
Feature engineering¶
In [85]:
def fe_previous_application(prev, nan_as_category=True):
    
    prev = condense_category_values(prev)

    prev.drop(columns=['FLAG_LAST_APPL_PER_CONTRACT','WEEKDAY_APPR_PROCESS_START', 'HOUR_APPR_PROCESS_START'], inplace=True)
    for bin_feature in ['NFLAG_INSURED_ON_APPROVAL', 'NFLAG_LAST_APPL_IN_DAY']:
           prev[bin_feature], uniques = pd.factorize(prev[bin_feature])
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)

    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    # Add feature: value ask / value received percentage

    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['sum', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg(
        {**num_aggregations, **cat_aggregations})

    prev_agg.columns = pd.Index(
        ['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])

    # Previous Applications: Approved Applications - only numerical features
    
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(
        ['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(
        ['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')

    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg
In [86]:
previous_application = fe_previous_application(previous_application)
reduce_mem_usage(previous_application)
Memory usage of dataframe is 312.55 MB
Memory usage after optimization is: 127.68 MB
Decreased by 59.1%

Credit Card Balance¶

In [87]:
credit_card_balance = read_data(data_dir+'credit_card_balance.csv')
initial_len = len(credit_card_balance)
credit_card_balance = credit_card_balance[credit_card_balance.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(credit_card_balance)))
Shape of the dataframe :(3840312, 23)
Memory usage of dataframe is 673.88 MB
Memory usage after optimization is: 263.69 MB
Decreased by 60.9%
612347 rows have been removed
In [88]:
eda.df_info(credit_card_balance)
Out[88]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
SK_ID_PREV int32 Numerical 87452 [2582071, 1389973, 1891521, 2181852, 1235299, ... 0 0.0
SK_ID_CURR int32 Numerical 86905 [363914, 337855, 126868, 367360, 203885, 34033... 0 0.0
MONTHS_BALANCE int8 Numerical 96 [-1, -4, -5, -3, -2, -19, -13, -18, -15, -12] 0 0.0
AMT_BALANCE float32 Numerical 1182264 [63975.5546875, 236572.109375, 453919.46875, 2... 0 0.0
AMT_CREDIT_LIMIT_ACTUAL int32 Numerical 167 [45000, 225000, 450000, 292500, 135000, 270000... 0 0.0
AMT_DRAWINGS_ATM_CURRENT float32 Numerical 2095 [2250.0, 0.0, 90000.0, 76500.0, 10800.0, 4500.... 605754 18.765817
AMT_DRAWINGS_CURRENT float32 Numerical 159227 [2250.0, 11547.0, 289339.4375, 111026.703125, ... 0 0.0
AMT_DRAWINGS_OTHER_CURRENT float32 Numerical 1650 [0.0, 137700.0, nan, 46800.0, 187200.0, 22950.... 605754 18.765817
AMT_DRAWINGS_POS_CURRENT float32 Numerical 143768 [0.0, 11547.0, 199339.421875, 34526.69921875, ... 605754 18.765817
AMT_INST_MIN_REGULARITY float32 Numerical 278880 [2250.0, 11795.759765625, 22924.890625, 130.5,... 264384 8.190423
AMT_PAYMENT_CURRENT float32 Numerical 142261 [2250.0, 11925.0, 27000.0, 4093.514892578125, ... 620093 19.210029
AMT_PAYMENT_TOTAL_CURRENT float32 Numerical 156928 [2250.0, 11925.0, 27000.0, 4093.514892578125, ... 0 0.0
AMT_RECEIVABLE_PRINCIPAL float32 Numerical 1045413 [60175.078125, 224949.28125, 443044.40625, 285... 0 0.0
AMT_RECIVABLE float32 Numerical 1174181 [64875.5546875, 233048.96875, 453919.46875, 28... 0 0.0
AMT_TOTAL_RECEIVABLE float32 Numerical 1174292 [64875.5546875, 233048.96875, 453919.46875, 28... 0 0.0
CNT_DRAWINGS_ATM_CURRENT float16 Numerical 44 [1.0, 0.0, 3.0, 2.0, 5.0, nan, 6.0, 4.0, 8.0, ... 605754 18.765817
CNT_DRAWINGS_CURRENT int16 Numerical 127 [1, 8, 9, 2, 30, 0, 4, 7, 3, 5] 0 0.0
CNT_DRAWINGS_OTHER_CURRENT float16 Numerical 11 [0.0, 1.0, nan, 3.0, 2.0, 4.0, 6.0, 5.0, 7.0, ... 605754 18.765817
CNT_DRAWINGS_POS_CURRENT float16 Numerical 132 [0.0, 1.0, 5.0, 6.0, 29.0, 4.0, 8.0, nan, 14.0... 605754 18.765817
CNT_INSTALMENT_MATURE_CUM float16 Numerical 121 [69.0, 10.0, 101.0, 3.0, 38.0, 6.0, 27.0, 2.0,... 264384 8.190423
NAME_CONTRACT_STATUS category category 7 [Active, Completed, Signed, Demand, Sent propo... 0 0.0
SK_DPD int16 Numerical 844 [0, 2192, 7, 717, 28, 18, 1676, 1311, 1158, 1462] 0 0.0
SK_DPD_DEF int16 Numerical 308 [0, 7, 1, 701, 8, 5, 63, 31, 62, 122] 0 0.0
In [89]:
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(credit_card_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
list_unfrequent_category_values(credit_card_balance,0.001)
Valeurs les plus fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Active']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
Out[89]:
{'NAME_CONTRACT_STATUS': ['Demand', 'Sent proposal', 'Refused', 'Approved']}
Feature engineering¶
In [90]:
def fe_credit_card_balance(cc,nan_as_category=True):
    a = ['Signed','Demand','Sent proposal','Approved']
    cc["NAME_CONTRACT_STATUS"] = cc["NAME_CONTRACT_STATUS"].replace(a, 'Unfrequent')
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    cc['MONTHS_BALANCE'] = -cc['MONTHS_BALANCE'] 
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper()
                              for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
In [91]:
credit_card_balance = fe_credit_card_balance(credit_card_balance)
reduce_mem_usage(credit_card_balance)
Memory usage of dataframe is 43.18 MB
Memory usage after optimization is: 31.41 MB
Decreased by 27.3%

POS CASH Balance¶

In [92]:
POS_CASH_balance = read_data(data_dir+'POS_CASH_balance.csv')
initial_len = len(POS_CASH_balance)
POS_CASH_balance = POS_CASH_balance[POS_CASH_balance.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(POS_CASH_balance)))
Shape of the dataframe :(10001358, 8)
Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 171.69 MB
Decreased by 71.9%
1457983 rows have been removed
In [93]:
eda.df_info(POS_CASH_balance)
Out[93]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
SK_ID_PREV int32 Numerical 800337 [1803195, 1715348, 1784872, 1903291, 2341044, ... 0 0.0
SK_ID_CURR int32 Numerical 289444 [182943, 367990, 397406, 269225, 334279, 34216... 0 0.0
MONTHS_BALANCE int8 Numerical 96 [-31, -33, -32, -35, -38, -39, -34, -37, -41, ... 0 0.0
CNT_INSTALMENT float16 Numerical 72 [48.0, 36.0, 12.0, 24.0, 60.0, 18.0, 4.0, 25.0... 21863 0.255906
CNT_INSTALMENT_FUTURE float16 Numerical 78 [45.0, 35.0, 9.0, 42.0, 12.0, 43.0, 36.0, 16.0... 21878 0.256081
NAME_CONTRACT_STATUS category category 9 [Active, Signed, Completed, Approved, Returned... 0 0.0
SK_DPD int16 Numerical 3358 [0, 1, 2, 4, 3, 18, 7, 5, 12, 6] 0 0.0
SK_DPD_DEF int16 Numerical 1987 [0, 1, 2, 4, 3, 18, 7, 5, 12, 8] 0 0.0
In [94]:
print ("Valeurs les plus fréquentes pour les variables qualitatives")
print(list_overfrequent_category_values(POS_CASH_balance,0.9))
print ("Valeurs les moins fréquentes pour les variables qualitatives")
print(list_unfrequent_category_values(POS_CASH_balance,0.001))
Valeurs les plus fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Active']}
Valeurs les moins fréquentes pour les variables qualitatives
NAME_CONTRACT_STATUS
{'NAME_CONTRACT_STATUS': ['Demand', 'Returned to the store', 'Approved', 'Amortized debt', 'Canceled', 'XNA']}
Feature engineering¶
In [95]:
def fe_pos_cash(pos,  nan_as_category=True):
    a = ['Demand', 'Returned to the store', 'Approved', 'Amortized debt', 'Canceled', 'XNA']
    pos['NAME_CONTRACT_STATUS']= pos['NAME_CONTRACT_STATUS'].replace(a, 'Unfrequent')
    pos, cat_cols = one_hot_encoder(pos, nan_as_category=True)
    
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
In [96]:
POS_CASH_balance = fe_pos_cash(POS_CASH_balance)
reduce_mem_usage(POS_CASH_balance)
Memory usage of dataframe is 25.67 MB
Memory usage after optimization is: 9.11 MB
Decreased by 64.5%

Installments Payments¶

In [97]:
installments_payments = read_data(data_dir+'installments_payments.csv')
initial_len = len(installments_payments)
installments_payments = installments_payments[installments_payments.SK_ID_CURR.isin(application.SK_ID_CURR)]
print('%s rows have been removed' %(initial_len-len(installments_payments)))
Shape of the dataframe :(13605401, 8)
Memory usage of dataframe is 830.41 MB
Memory usage after optimization is: 311.40 MB
Decreased by 62.5%
2013809 rows have been removed
In [98]:
eda.df_info(installments_payments)
Out[98]:
data_type data_type_grp num_unique_values sample_unique_values num_missing perc_missing
SK_ID_PREV int32 Numerical 853344 [1054186, 1330831, 2085231, 2452527, 2714724, ... 0 0.0
SK_ID_CURR int32 Numerical 291643 [161674, 151639, 193053, 199697, 167756, 16448... 0 0.0
NUM_INSTALMENT_VERSION float16 Numerical 55 [1.0, 0.0, 2.0, 4.0, 3.0, 5.0, 7.0, 8.0, 6.0, ... 0 0.0
NUM_INSTALMENT_NUMBER int16 Numerical 277 [6, 34, 1, 3, 2, 12, 11, 4, 14, 8] 0 0.0
DAYS_INSTALMENT float16 Numerical 2485 [-1180.0, -2156.0, -63.0, -2418.0, -1383.0, -1... 0 0.0
DAYS_ENTRY_PAYMENT float16 Numerical 2554 [-1187.0, -2156.0, -63.0, -2426.0, -1366.0, -1... 2583 0.022283
AMT_INSTALMENT float32 Numerical 831095 [6948.35986328125, 1716.5250244140625, 25425.0... 0 0.0
AMT_PAYMENT float32 Numerical 875277 [6948.35986328125, 1716.5250244140625, 25425.0... 2583 0.022283
Feature engineering¶
In [99]:
def fe_install_payments(ins, nan_as_category=True):
    
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)

    # Percentage and difference paid in each installment (amount paid and installment value)

    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']

    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(
        ['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg
In [100]:
installments_payments = fe_install_payments(installments_payments)
reduce_mem_usage(installments_payments)
Memory usage of dataframe is 39.49 MB
Memory usage after optimization is: 26.42 MB
Decreased by 33.1%

Merge des dataframes¶

In [101]:
print("shape:", application.shape)
with timer("Merge Application with bureau and bureau_balance"):
    
    df = application.merge(bureau_and_bb, how='left', on='SK_ID_CURR')
    print("shape:", application.shape)
    del application
    gc.collect()

with timer("Merge with previous_application"):

    df = df.merge(previous_application, how='left', on='SK_ID_CURR')
    del previous_application
    print("shape:", df.shape)
    gc.collect()
  

with timer("Merge with POS-CASH balance"):
   
    df = df.join(POS_CASH_balance, how='left', on='SK_ID_CURR')
    del POS_CASH_balance
    print("shape:", df.shape)
    gc.collect()

with timer("Merge with installments payments"):
        
    df = df.join(installments_payments, how='left', on='SK_ID_CURR')
    del installments_payments
    print("shape:", df.shape)
    gc.collect()

with timer("Merge with credit card balance"):
    df = df.join(credit_card_balance, how='left', on='SK_ID_CURR')
    del credit_card_balance
    print("shape:", df.shape)
    gc.collect()
shape: (307511, 124)
shape: (307511, 124)
Merge Application with bureau and bureau_balance - done in 2s
shape: (307511, 400)
Merge with previous_application - done in 2s
shape: (307511, 413)
Merge with POS-CASH balance - done in 1s
shape: (307511, 439)
Merge with installments payments - done in 1s
shape: (307511, 565)
Merge with credit card balance - done in 1s

Filtrage des colonnes¶

Suppression des colonnes les plus vides (60%)¶

In [102]:
# Liste des colonnes vides avec plus de 60% de valeurs vides

X = df.drop(columns = ['SK_ID_CURR','TARGET'])

emptiest_columns = list_emptiest_columns(X, 0.6)


df = df.drop(columns=emptiest_columns)

print("Suppression de %s colonnes" %len(emptiest_columns))
Suppression de 173 colonnes

Imputation des valeurs manquantes¶

A ce stade, je préfère imputer les valeurs manquantes par zéro. En effet, l'encodage des variables qualitatives a déjà été fait, avec des colonnes _nan. Une valeur manquante signifie l'inexistence totale de l'individu, à savoir ni une précédente valeur vide, ni non-vide d'où la valeur zéro, de même pour les variables numériques.

In [103]:
# Remplacement des valeurs vides ou infinies par 0

df.replace([np.inf, -np.inf], 0, inplace=True)
df = df.fillna(0)
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307511 entries, 0 to 307510
Columns: 392 entries, SK_ID_CURR to INSTAL_COUNT
dtypes: float16(296), float32(81), float64(15)
memory usage: 306.2 MB
In [104]:
df.to_pickle('../../gen_data/data_before_feature_selection.pkl')

Sélection des colonnes les plus importantes¶

Dans le cadre de la lutte contre le fléau dimensionnel, je sélectionne 100 colonnes les plus importantes avec SelectKbest.

In [105]:
to_ignore = ['SK_ID_CURR','TARGET']
all_columns = df.columns
print("Before ")
print(df.info())
features_to_check = [feature for feature in all_columns if feature not in to_ignore]

X = df[features_to_check]
y = df[['TARGET']]

selector = SelectKBest(f_classif, k = 100).fit(X,y)
k_best_features = selector.get_feature_names_out()

to_keep = list(['SK_ID_CURR','TARGET']) + list(k_best_features) 
df = df[to_keep]

print("After ")
print(df.info())
Before 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307511 entries, 0 to 307510
Columns: 392 entries, SK_ID_CURR to INSTAL_COUNT
dtypes: float16(296), float32(81), float64(15)
memory usage: 306.2 MB
None
After 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 307511 entries, 0 to 307510
Columns: 102 entries, SK_ID_CURR to INSTAL_DAYS_ENTRY_PAYMENT_SUM
dtypes: float16(86), float32(11), float64(5)
memory usage: 77.4 MB
None

Sauvegarde des données qui seront utilisées pour l'apprentissage

In [106]:
df.to_pickle('../../gen_data/data_to_train.pkl')

Binning des données¶

Fonction de condenser les données par feature via binning des valeurs et comptage. Ces données seront utilisées pour comparer le client courant aux autres clients de la base, par feature.

In [107]:
def build_binned_data(data,num_bins) : 

    features = data.drop(columns=['SK_ID_CURR','TARGET']).columns

    grouped_and_binned_data_df = pd.DataFrame([], columns=['feature', 'bin','TARGET','count', 'percent_of_target'])

    for col in features : 
        target_data = data[[col,'TARGET']]
        target_data['feature'] = col
        target_data['bin'], _ = pd.cut(target_data[col], num_bins, retbins=True)
        target_data[['feature','bin', 'TARGET']].value_counts()
        grouped_data = pd.DataFrame(target_data[['feature','bin', 'TARGET']].value_counts())
     
        # Attention, il faut ordonner les bins pour la représentation graphique !

        grouped_data =  grouped_data.reset_index().sort_values(by='bin', ascending=True).rename(columns={0: 'count'})

        # rajouter les pourcentage des target 0 et pourcentages des target 1

        count1 = grouped_data[grouped_data.TARGET == 1]['count'].sum()
        count0 = grouped_data[grouped_data.TARGET == 0]['count'].sum()

        grouped_data.loc[grouped_data['TARGET'] == 1, 'percent_of_target'] = (grouped_data['count']/count1)*100
        grouped_data.loc[grouped_data['TARGET'] == 0, 'percent_of_target'] = (grouped_data['count']/count0)*100
        grouped_and_binned_data_df = pd.concat([grouped_and_binned_data_df, grouped_data])
   
        del target_data, grouped_data, count0, count1
        gc.collect()
    return  grouped_and_binned_data_df

Sauvegarde des données "condensées"

In [108]:
import re
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
In [109]:
binned_data_df = build_binned_data(df, num_bins =10)
binned_data_df.to_pickle('../../gen_data/binned_data.pkl')
In [110]:
binned_data_df
Out[110]:
feature bin TARGET count percent_of_target
0 AMT_CREDIT (40995.0, 445500.0] 0.0 114825 40.619274
5 AMT_CREDIT (40995.0, 445500.0] 1.0 10284 41.425982
1 AMT_CREDIT (445500.0, 846000.0] 0.0 102444 36.239502
4 AMT_CREDIT (445500.0, 846000.0] 1.0 10285 41.430010
2 AMT_CREDIT (846000.0, 1246500.0] 0.0 41676 14.742860
... ... ... ... ... ...
7 INSTAL_DAYS_ENTRY_PAYMENT_SUM (-180746.7, -120497.8] 1.0 881 3.548842
6 INSTAL_DAYS_ENTRY_PAYMENT_SUM (-120497.8, -60248.9] 1.0 2129 8.576032
1 INSTAL_DAYS_ENTRY_PAYMENT_SUM (-120497.8, -60248.9] 0.0 29717 10.512371
2 INSTAL_DAYS_ENTRY_PAYMENT_SUM (-60248.9, 0.0] 1.0 21000 84.592145
0 INSTAL_DAYS_ENTRY_PAYMENT_SUM (-60248.9, 0.0] 0.0 227637 80.526450

1542 rows × 5 columns

In [111]:
! jupyter nbconvert --to html data_cleaning_and_merge.ipynb